import docx
import pandas as pd
from pdf2docx import Converter
import re

# Source PDF and the intermediate DOCX file it is converted into.
pdf_file = r"C:\Users\kf40\Desktop\智能提取信息\1617148857055-dongfanghongwenjianjingxuanhunhexingzhengquantouzijijin2020nianniandubaogao.pdf"
docx_file = r'C:\Users\kf40\Desktop\智能提取信息\sample01.docx'

# Convert the PDF to DOCX (start=0, end=None: no page range restriction) so
# that the rest of the script can read it with python-docx.
cv = Converter(pdf_file)
try:
    cv.convert(docx_file, start=0, end=None)
finally:
    # Always release the converter, even when the conversion fails part-way.
    cv.close()

# Load the converted document.
file = docx.Document(docx_file)
# Print the paragraph count (original author's note: 13 for their sample,
# one paragraph per line break).
print("段落数:" + str(len(file.paragraphs)))

tables = file.tables
# Labels expected in the first cell of a table row; the second cell of such a
# row is stored as the value for that label.
keys = [
    '基金名称', "基金简称", "场内简称", "基金主代码", '基金类别', '基金管理人',
    '基金托管人', '基金合同生效日', '基金运作方式', '业绩比较基准', '投资目标',
    '投资范围', '投资比例', '基金分红', '风险收益特征', '风险等级',
]
result = {}
for table in tables:
    for row in table.rows:
        # Cell texts of this row with all spaces removed.
        row_data = [cell.text.replace(" ", "") for cell in row.cells]
        # A label/value pair needs at least two cells; shorter rows are
        # skipped instead of raising IndexError.
        if len(row_data) >= 2 and row_data[0] in keys:
            # When a label occurs in several rows, the last one wins.
            result[row_data[0]] = row_data[1]
# Derive the fund category from markers found in the fund name.
fund_name = result["基金名称"]

# (marker, category) pairs, checked in order. Every matching rule assigns the
# category, so a later match overrides an earlier one. When nothing matches,
# the category read from the tables (if any) is left untouched.
category_rules = (
    ("竞争力指数发起式证券投资基金", "股票型投资基金"),
    ("混合型", "混合型基金中基金" if "FOF" in fund_name else "混合型证券投资基金"),
    ("债券型", "债券型证券投资基金"),
    ("货币市场", "货币市场基金"),
)
for marker, category in category_rules:
    if marker in fund_name:
        result["基金类别"] = category
# Extract the profit-distribution statement that lies between two section
# headings and store it under "基金分红".
paragraphs = file.paragraphs  # fetch the paragraph list once, not per iteration

# Find the paragraph indexes of the two headings. There is no early exit, so
# when a heading text occurs more than once the last occurrence wins.
start = end = None
for i, paragraph in enumerate(paragraphs):
    temp = paragraph.text
    if '管理人对报告期内基金利润分配情况的说明' in temp:
        start = i
    if '报告期内管理人对本基金持有人数或基金资产净值预警情形的说明' in temp:
        end = i

text = ""
if start is None or end is None:
    # A missing heading used to surface as a NameError; skip the field
    # instead so the remaining fields can still be exported.
    print("Warning: profit-distribution section headings not found; "
          "'基金分红' is left unset.")
else:
    # Take the paragraphs after the first heading, stopping two paragraphs
    # before the second heading.
    # NOTE(review): the reason for the "- 2" is not visible here; presumably
    # it skips trailing page header/footer paragraphs -- confirm.
    text = "".join(paragraphs[i].text for i in range(start + 1, end - 2))

    # Remove page-marker residue of the form "第...页...年度报告".
    # NOTE(review): both ".*" are greedy, so if several markers are present
    # everything between the first "第" and the last "年度报告" is removed --
    # confirm this is intended.
    text = re.sub("第.*页.*年度报告", "", text)
    result["基金分红"] = text
# Extract the investment scope ("投资范围") and investment ratio ("投资比例")
# from the text between two marker paragraphs.
paragraphs = file.paragraphs  # fetch the paragraph list once, not per iteration

# Reset the bounds so that indexes left over from an earlier search can never
# be reused by accident. The last matching paragraph wins for each marker.
start = end = None
for i, paragraph in enumerate(paragraphs):
    temp = paragraph.text
    if '根据《中华人民共和国证券投资基金法》和' in temp:
        start = i
        print(temp)
    # NOTE(review): this marker hard-codes one fund manager's name, so reports
    # from other managers will not match.
    if '本财务报表由基金管理人上海东方证券资产管理有限公司于' in temp:
        end = i
        print(temp)

text = ""
if start is None or end is None:
    print("Warning: investment-scope marker paragraphs not found; "
          "'投资范围' and '投资比例' are left unset.")
else:
    # Both marker paragraphs are included in the extracted text.
    text = "".join(p.text for p in paragraphs[start:end + 1])
    # Remove page-marker residue of the form "第...页...年度报告" (greedy match).
    text = re.sub("第.*页.*年度报告", "", text)

    # Scope: from "本基金的投资范围" through the closing sentence.
    scope_match = re.search("本基金的投资范围.+可以将其纳入投资范围。", text)
    # Ratio: whatever follows the scope up to the financial-statement sentence.
    ratio_match = re.search("可以将其纳入投资范围。(.*?)本财务报表由基金管理人", text)

    if scope_match:
        result["投资范围"] = scope_match.group(0)
    else:
        print("Warning: investment scope text not found; '投资范围' is left unset.")
    if ratio_match:
        result["投资比例"] = ratio_match.group(1)
    else:
        print("Warning: investment ratio text not found; '投资比例' is left unset.")
# Show full cell contents when the frame is displayed. None means "no limit";
# the former value -1 is deprecated since pandas 1.0 and rejected by newer
# pandas releases.
pd.set_option('display.max_colwidth', None)

# One row per extracted field, with the value in a single 'content' column.
df = pd.DataFrame(pd.Series(result), columns=['content'])
df = df.rename(index={'基金主代码': "基金代码"})
# Fix the row order of the output; fields that were not extracted become NaN
# rows here and are removed by dropna() below.
df = df.reindex(
    ['基金名称', "基金简称", "场内简称", "基金代码", '基金类别', '基金管理人', '基金托管人', '基金合同生效日', '基金运作方式', '业绩比较基准', '投资目标', '投资范围', '投资比例',
     '基金分红', '风险收益特征', '风险等级'])
df = df.dropna()

try:
    # display() only exists when running under IPython/Jupyter.
    display(df)
except NameError:
    # Plain Python interpreter: fall back to a text rendering.
    print(df)

# The Excel file is named after the fund's short name.
name = result['基金简称']
path = 'C:\\Users\\kf40\\Desktop\\智能提取信息\\' + name + '.xlsx'
df.to_excel(path)